Developer CD Series 1996 May: Tool Chest

home *** CD-ROM | disk | FTP | other *** search

/ Developer CD Series 1996 May: Tool Chest / Developer CD Series May 1996 (Tool Chest) (Apple Computer) (1996).iso / Tool Chest / Development Tools & Languages / Dylan Related / Mindy / Mindy 1.2 - portable sources / libraries / string-ext / parse.dylan < prev next >

Wrap

Text File | 1995-03-15 | 11.2 KB | 338 lines | [TEXT/ttxt]

module: regular-expressions author: Nick Kramer (nkramer@cs.cmu.edu) copyright: Copyright (C) 1994, Carnegie Mellon University. All rights reserved. rcs-header: $Header: parse.dylan,v 1.1 94/11/08 22:57:41 nkramer Exp $ //====================================================================== // // Copyright (c) 1994 Carnegie Mellon University // All rights reserved. // // Use and copying of this software and preparation of derivative // works based on this software are permitted, including commercial // use, provided that the following conditions are observed: // // 1. This copyright notice must be retained in full on any copies // and on appropriate parts of any derivative works. // 2. Documentation (paper or online) accompanying any system that // incorporates this software, or any part of it, must acknowledge // the contribution of the Gwydion Project at Carnegie Mellon // University. // // This software is made available "as is". Neither the authors nor // Carnegie Mellon University make any warranty about the software, // its performance, or its conformity to any specification. // // Bug reports, questions, comments, and suggestions should be sent by // E-mail to the Internet address "gwydion-bugs@cs.cmu.edu". // //====================================================================== // This is a program to parse regular expressions. The grammar I'm using is: // // <regexp> ::= <alternative> | <alternative>|<regexp> // // <alternative> ::= <quantified-atom> | <quantified-atom><alternative> // // <quantified-atom> ::= <atom> | <atom><quantifier> // // <quantifier> ::= * | + | ? | {n} | {n,} | {n, m} // (where n and m are decimal integers) // // <atom> ::= (<regexp>) | <extended-character> // // See "Programming perl", p. 103-104 for more details. // // Because an assertion is a type of <extended-character>, this will // parse a "quantified assertion", which really isn't a legal regular // expression component. Match.dylan could go into an infinite loop // if given this. define constant <integer?> = union(<integer>, singleton(#f)); define abstract class <parsed-regexp> (<object>) end class <parsed-regexp>; define class (<parsed-regexp>) slot child :: <parsed-regexp>, required-init-keyword: child: ; slot group-number :: <integer>, required-init-keyword: group: ; end class ; define class <union> (<parsed-regexp>) // | slot left :: <parsed-regexp>, required-init-keyword: left: ; slot right :: <parsed-regexp>, required-init-keyword: right: ; end class <union>; define class <alternative> (<parsed-regexp>) // concatenation slot left :: <parsed-regexp>, required-init-keyword: left: ; slot right :: <parsed-regexp>, required-init-keyword: right: ; end class <alternative>; define class <parsed-assertion> (<parsed-regexp>) slot asserts :: <symbol>, required-init-keyword: assertion: ; end class <parsed-assertion>; define class <quantified-atom> (<parsed-regexp>) slot atom :: <parsed-regexp>, required-init-keyword: atom: ; slot min-matches :: <integer>, init-value: 0, init-keyword: min: ; slot max-matches :: <integer?>, init-value: #f, init-keyword: max: ; end class <quantified-atom>; define abstract class <parsed-atom> (<parsed-regexp>) end class <parsed-atom>; define class <parsed-character> (<parsed-atom>) slot character :: <character>, required-init-keyword: character: ; end class <parsed-character>; define class <parsed-set> (<parsed-atom>) slot char-set :: <character-set>, required-init-keyword: set: ; end class <parsed-set>; define class <parsed-backreference> (<parsed-atom>) slot group-number :: <integer>, required-init-keyword: group: ; end class <parsed-backreference>; // <parse-info> contains some information about the current regexp // being parsed. Using a structure is slightly nicer than having // global variables.. // define class <parse-info> (<object>) slot backreference-used, init-value: #f; // Whether or not the function includes \1, \2, etc in the regexp. // This is different from return-marks, which determines whether the // user wants to know about the marks. slot has-alternatives, init-value: #f; slot has-quantifiers, init-value: #f; slot current-group-number, init-value: 0; slot set-type :: <class>, init-keyword: set-type: ; end class <parse-info>; define method parse (s :: <string>, character-set-type :: <class>); let parse-info = make(<parse-info>, set-type: character-set-type); let parse-string = make(<parse-string>, string: s); let parse-tree = make(, group: 0, child: parse-regexp(parse-string, parse-info)); values(parse-tree, parse-info.current-group-number, parse-info.backreference-used, parse-info.has-alternatives, parse-info.has-quantifiers); end method parse; define method parse-regexp(s :: <parse-string>, info :: <parse-info>) => parsed-regexp :: <parsed-regexp>; let alternative = parse-alternative(s, info); if (lookahead(s) = '|') info.has-alternatives := #t; make(<union>, left: alternative, right: parse-regexp(consume(s), info)); else alternative; end if; end method parse-regexp; define method parse-alternative(s :: <parse-string>, info :: <parse-info>) => parsed-regexp :: <parsed-regexp>; let term = parse-quantified-atom(s, info); if (member?(lookahead(s), #(#f, '|', ')'))) term; else make(<alternative>, left: term, right: parse-alternative(s, info)); end if; end method parse-alternative; define method parse-quantified-atom(s :: <parse-string>, info :: <parse-info>) => parsed-regexp :: <parsed-regexp>; let atom = parse-atom(s, info); let char = lookahead(s); select (char by \=) '*' => info.has-quantifiers := #t; consume(s); make(<quantified-atom>, min: 0, atom: atom); '+' => info.has-quantifiers := #t; consume(s); make(<quantified-atom>, min: 1, atom: atom); '?' => info.has-quantifiers := #t; consume(s); make(<quantified-atom>, min: 0, max: 1, atom: atom); '{' => info.has-quantifiers := #t; consume(s); let first-string = make(<deque>); let second-string = make(<deque>); let has-comma = #f; for (c = lookahead(s) then lookahead(s), until c = '}') consume(s); if (c = ',') has-comma := #t; elseif (has-comma) push-last(second-string, c); else push-last(first-string, c); end if; end for; consume(s); // Eat closing brace make(<quantified-atom>, atom: atom, min: string-to-integer(first-string), max: if (~has-comma) string-to-integer(first-string) elseif (empty?(second-string)) #f else string-to-integer(second-string) end if); otherwise => atom; end select; end method parse-quantified-atom; define method parse-atom (s :: <parse-string>, info :: <parse-info>) => parsed-regexp :: <parsed-regexp>; let char = lookahead(s); select (char) '(' => consume(s); // Consume beginning paren info.current-group-number := info.current-group-number + 1; let this-group = info.current-group-number; let regexp = parse-regexp(s, info); if (lookahead(s) ~= ')') error("Unbalanced parens in regexp"); end if; consume(s); // Consume end paren make(, child: regexp, group: this-group); ')' => #f; // Need something to terminate upon seeing a close paren #f => #f; // Signal error? (end of stream) '*', '|', '+' => #f; '\\' => consume(s); // Consume the backslash parse-escaped-character(s, info); '[' => consume(s); // Eat the opening brace let set-string = make(<deque>); // Need something that'll // preserve the right ordering for (char = lookahead(s) then lookahead(s), until char = ']') consume(s); // eat char push-last(set-string, char); if (char = '\\') push-last(set-string, lookahead(s)); consume(s); // eat thing after backslash end if; end for; consume(s); // Eat ending brace make(<parsed-set>, set: make(info.set-type, description: set-string)); '.' => consume(s); dot; '^' => consume(s); make(<parsed-assertion>, assertion: #"beginning-of-string"); '$' => consume(s); make(<parsed-assertion>, assertion: #"end-of-string"); // Insert more special characters here otherwise => let char = lookahead(s); consume(s); make(<parsed-character>, character: char); end select; end method parse-atom; define constant any-char = make(<case-sensitive-character-set>, description: "^\n"); // The useful definitions of all these is in as(<character-set>) // define constant digit-chars = make(<case-sensitive-character-set>, description: "\\d"); define constant not-digit-chars = make(<case-sensitive-character-set>, description: "^\\d"); define constant word-chars = make(<case-sensitive-character-set>, description: "\\w"); define constant not-word-chars = make(<case-sensitive-character-set>, description: "^\\w"); define constant whitespace-chars = make(<case-sensitive-character-set>, description: "\\s"); define constant not-whitespace-chars = make(<case-sensitive-character-set>, description: "^\\s"); define constant dot = make(<parsed-set>, set: any-char); define constant dot-star = make(<quantified-atom>, min: 0, max: #f, atom: dot); // This only handles escaped characters *outside* of a character // set. Inside of a character set is a whole different story. // define method parse-escaped-character (s :: <parse-string>, info :: <parse-info>) => parsed-regexp :: <parsed-regexp>; let next-char = lookahead(s); consume(s); select (next-char) '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' => info.backreference-used := #t; make(<parsed-backreference>, group: digit-to-integer(next-char)); 'n' => make(<parsed-character>, character: '\n'); // Newline 't' => make(<parsed-character>, character: '\t'); // Tab 'f' => make(<parsed-character>, character: '\f'); // Formfeed 'r' => make(<parsed-character>, character: '\r'); // Carriage return 'b' => make(<parsed-assertion>, assertion: #"word-boundary"); 'B' => make(<parsed-assertion>, assertion: #"not-word-boundary"); // Beginning and end of string are not escaped 'd' => make(<parsed-set>, set: digit-chars); 'D' => make(<parsed-set>, set: not-digit-chars); 'w' => make(<parsed-set>, set: word-chars); 'W' => make(<parsed-set>, set: not-word-chars); 's' => make(<parsed-set>, set: whitespace-chars); 'S' => make(<parsed-set>, set: not-whitespace-chars); // Insert more escaped characters here otherwise => make(<parsed-character>, character: next-char); end select; end method parse-escaped-character; define method is-anchored? (regexp :: <parsed-regexp>) => (result :: <boolean>); select (regexp by instance?) => is-anchored?(regexp.child); <alternative> => is-anchored?(regexp.left); <parsed-assertion> => regexp.asserts == #"beginning-of-string"; otherwise => #f; end select; end method is-anchored?;